Click Predictor
import pandas as pd
import seaborn as sns
import plotly.express as px
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
# Load the labelled training set and the unlabelled hold-out set.
data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Quick sanity check of the raw frame and its schema.
data.head()
data.columns
Index(['id', 'Timestamp', 'Daily Time Spent on Site', 'Age', 'Area Income',
'Daily Internet Usage', 'Ad Topic Line', 'gender', 'Country',
'Clicked'],
dtype='object')

Check for null values

# Count missing values per column to decide what needs imputation.
data.isnull().sum()

Replace nulls in Area Income with the Median of Area Income

# Impute missing Area Income with the column median (robust to outliers),
# then confirm no nulls remain.
median_income = data['Area Income'].median()
data['Area Income'] = data['Area Income'].fillna(median_income)
data.isnull().sum()

Create a column to sum "not_clicked"

# Inverse of Clicked: 1 when the ad was NOT clicked, 0 when it was.
# (Original line was a SyntaxError: missing space/comma in "x==0else 0".)
data['no_click'] = data.Clicked.apply(lambda x: 1 if x == 0 else 0)
data.head()

Look at Clicked vs Not Clicked by Gender

# Aggregate click / no-click counts per gender and plot them side by side.
# (Original px.bar call had missing commas between keyword args — SyntaxError.)
gender = data.groupby("gender").sum().reset_index()
fig = px.bar(gender, x='gender', y=['Clicked', 'no_click'], title="Click Rate by Gender")
fig.show()
−0.500.511.50100200300400500
variableClickedno_clickClick Rate by Gendergendervalue

Examine time on site and how it affects the click rate

# Distribution of daily time on site, split (dodged bars) by click outcome.
# (Original call was missing commas between keyword args — SyntaxError.)
ax = sns.displot(data, x='Daily Time Spent on Site', hue='Clicked', multiple='dodge')
ax.set(title="Clicks by Time on Site")
<seaborn.axisgrid.FacetGrid at 0x7f7cac302760>

Create a true "date" column with a date data type to look at day of week vs clicks

# Derive a real date column from the first 10 chars of Timestamp
# (assumes an ISO-like "YYYY-MM-DD..." prefix — TODO confirm against the data),
# then a day-of-week name for grouping.
data['date'] = data.Timestamp.apply(lambda x: x[0:10])
data.head()
# pd.to_datetime is the supported conversion; astype(np.datetime64) with a
# unitless dtype is deprecated/removed in modern pandas.
data.date = pd.to_datetime(data.date)
data['day'] = data['date'].dt.day_name()
data.head()
# NOTE(review): .sum() over the whole frame relies on pandas dropping/handling
# non-numeric columns; newer pandas may want numeric_only=True here.
dayofweek = data.groupby('day').sum().reset_index()
dayofweek.head(7)
# Clicks vs no-clicks per weekday, ordered Sunday..Saturday.
# (Original call was missing commas inside the list/dict literals — SyntaxError.)
fig = px.bar(
    dayofweek, x='day', y=['Clicked', 'no_click'],
    title='Clicks by Day of Week',
    category_orders={'day': ['Sunday', 'Monday', 'Tuesday', 'Wednesday',
                             'Thursday', 'Friday', 'Saturday']},
    labels={'day': 'Day of Week', 'value': 'Count', 'variable': 'Click Rate'},
)
fig.show()
SundayMondayTuesdayWednesdayThursdayFridaySaturday050100150
Click RateClickedno_clickClicks by Day of WeekDay of WeekCount

It doesn't look like the day of week is a significant factor in clicks, although Tuesdays and Saturdays are less likely to produce clicks. Thursdays and Fridays have slightly more clicks than the remaining days.

Break down Time Spent on Site into categories and evaluate the impact on clicks

def timemap(x):
    """Bucket daily minutes-on-site into decade bins clamped to [20, 90].

    <30 -> 20; [30, 90) -> its decade (30, 40, ..., 80); anything else —
    including >=90 and NaN, which fails both comparisons — -> 90.
    Replaces the original eight-branch elif ladder with the same mapping.
    """
    if x < 30:
        return 20
    if x < 90:
        # Floor to the decade: 30..39 -> 30, 40..49 -> 40, etc.
        return int(x // 10) * 10
    return 90
# Bucket each row's time on site via timemap, then aggregate clicks per bucket.
data['site_time']=data['Daily Time Spent on Site'].apply(timemap)
data.head()
site=data.groupby('site_time').sum().reset_index()
site.head()
# Clicks vs no-clicks per time-on-site bucket.
# (Original labels dict was missing a comma between entries — SyntaxError.)
fig = px.bar(site, x='site_time', y=['Clicked', 'no_click'],
             title="Clicks by Time Spent on Site",
             labels={'site_time': 'Time Spent (minutes)',
                     'value': 'Count Click/No Click',
                     'variable': 'Click Rate'})

fig.show()
30405060708090050100150200250
Click RateClickedno_clickClicks by Time Spent on SiteTime Spent (minutes)Count Click/No Click

Those who spend less than an hour on the site are far more likely to click than those who spend more time. 30 minutes or less has the highest click rate by far.

Look at how age affects click rates

# Age distribution split by click outcome.
# (Original call was missing commas between keyword args — SyntaxError.)
ax = sns.displot(data, x='Age', hue='Clicked', multiple="dodge")
ax.set(xlabel="Age")
<seaborn.axisgrid.FacetGrid at 0x7f7cac1c3dc0>

Create Age Groups to see data more clearly

def agegroups(x):
    """Map an age to a decade label: "20-29", "30-39", ..., "60+".

    NaN fails every comparison and falls through to "60+", matching the
    original else branch.  The redundant compound checks (x>=30 and x<40)
    are collapsed: each guard already implies the lower bound.
    """
    if x < 30:
        return "20-29"
    if x < 40:
        return "30-39"
    if x < 50:
        return "40-49"
    if x < 60:
        return "50-59"
    return "60+"
# Label each row with its age group and aggregate clicks per group.
data['age_groups']=data['Age'].apply(agegroups)
data.head()
ages=data.groupby('age_groups').sum().reset_index()
ages.head()
# Clicks vs no-clicks per age group.
# (Original labels dict was missing a comma between entries — SyntaxError.)
fig = px.bar(ages, x='age_groups', y=['Clicked', 'no_click'],
             title="Clicks by Age Groups",
             labels={'age_groups': 'Age Groups',
                     'value': 'Count Click/No Click',
                     'variable': 'Click Rate'})

fig.show()
20-2930-3940-4950-5960+0100200300400
Click RateClickedno_clickClicks by Age GroupsAge GroupsCount Click/No Click

The most clicks are generated by 30-39 year olds but their rate of clicks is the lowest, 40 and older have the highest rates of clicks.

Those who are 40 and older generate as many clicks as those younger but with a much higher ratio.

Examine how income levels relate to clicks

def inclevel(x):
    """Map an Area Income value to a labelled band ("<30K" .. "70K+").

    NaN fails every comparison and falls through to "70K+", matching the
    original else branch.  The redundant compound checks are collapsed:
    each guard already implies the lower bound.
    """
    if x < 30000:
        return "<30K"
    if x < 40000:
        return "30-39K"
    if x < 50000:
        return "40-49K"
    if x < 60000:
        return "50-59K"
    if x < 70000:
        return "60-69K"
    return "70K+"
# Band each row's income and aggregate clicks per band.
data['income_level']=data['Area Income'].apply(inclevel)
income=data.groupby('income_level').sum().reset_index()
income.head()
# Clicks vs no-clicks per income band, ordered low to high.
# (Original call was missing commas throughout its literals — SyntaxError.)
fig = px.bar(
    income, x='income_level', y=['Clicked', 'no_click'],
    title='Income Levels and Clicks',
    labels={'income_level': "Income Groups",
            'value': "Count Click/No Click",
            'variable': 'Click Rate'},
    category_orders={'income_level': ['<30K', '30-39K', '40-49K',
                                      '50-59K', '60-69K', '70K+']},
)
fig.show()
<30K30-39K40-49K50-59K60-69K70K+0100200300400
Click RateClickedno_clickIncome Levels and ClicksIncome GroupsCount Click/No Click

Income levels below 50K have a significantly higher rate of clicks with about the same total clicks as those making 50K and above.

Examine how total time on the internet affects clicks

# Distribution of daily internet usage split by click outcome.
# (Original call was missing commas between keyword args — SyntaxError.)
ax = sns.displot(data, x='Daily Internet Usage', hue='Clicked', multiple="dodge")
ax.set(xlabel="Time Spent on the Internet")
<seaborn.axisgrid.FacetGrid at 0x7f7cbed3e3d0>

There is an inverse relationship between total time on the internet and clicks

Create a new category for 175 minutes or less with a boolean value

# Boolean flag: True when the user spends <= 175 minutes/day on the internet.
data['total_time']=data['Daily Internet Usage']<=175
data['total_time'].value_counts()
# Aggregate clicks for the two flag values.
total_t=data.groupby('total_time').sum().reset_index()
total_t.head()
# Clicks vs no-clicks for the <=175-minutes flag.
# (Original call was missing commas between keyword args — SyntaxError.)
fig = px.bar(total_t, x='total_time', y=['Clicked', 'no_click'],
             title="How total time on the internet affects clicks",
             labels={'total_time': "Spends 175 minutes or less per day on the internet",
                     'value': 'Count',
                     'variable': 'Click Rate'})
fig.show()
falsetrue0100200300400500
Click RateClickedno_clickHow total time on the internet affects clicksSpends 175 minutes or less per day on the internetCount

Examine ad topic line in relation to clicks

# Per-topic totals: each row is one Ad Topic Line with its click counts.
topic=data.groupby('Ad Topic Line').sum().reset_index()
topic
# "Highly effective" topics: more than one click and never ignored.
effective1=topic[(topic['Clicked']>1) & (topic['no_click']==0)]
highly_effective=effective1['Ad Topic Line'].tolist()
len(highly_effective)
52
# "Effective" topics: exactly one click and never ignored.
effective2=topic[(topic['Clicked']==1) & (topic['no_click']==0)]
effective=effective2['Ad Topic Line'].tolist()
len(effective)
326
# "Ineffective" topics: at most one click and ignored at least once.
effective3=topic[(topic['Clicked']<=1) & (topic['no_click']>0)]
ineffective=effective3['Ad Topic Line'].tolist()
len(ineffective)
447
# NOTE(review): this repeats the exact filter used for effective3 above;
# effective4/ineff duplicate effective3/ineffective and could be removed.
effective4=topic[(topic['Clicked']<=1) & (topic['no_click']>0)]

ineff=effective4['Ad Topic Line'].tolist()
len(ineff)
447
# Total no-click impressions across the ineffective topic lines.
effective4['no_click'].sum()
497
# Hoist the lists into sets once: O(1) membership per row instead of scanning
# a few-hundred-element list for each of the 1000 topic lines.
_high_set = set(highly_effective)
_effective_set = set(effective)

def effectiveness(x):
    """Return 'High', 'Medium', or 'Low' effectiveness for topic line *x*.

    Same classification as before; the second test is now an explicit elif
    (the original used a second bare `if`, reachable only via fall-through).
    """
    if x in _high_set:
        return "High"
    elif x in _effective_set:
        return "Medium"
    return "Low"

data['ad_effectiveness']=data['Ad Topic Line'].apply(effectiveness)
data['ad_effectiveness'].value_counts()

Create data sets for predictions

data.columns
Index(['id', 'Timestamp', 'Daily Time Spent on Site', 'Age', 'Area Income',
'Daily Internet Usage', 'Ad Topic Line', 'gender', 'Country', 'Clicked',
'no_click', 'date', 'day', 'site_time', 'age_groups', 'income_level',
'total_time', 'ad_effectiveness'],
dtype='object')
# Keep only the engineered features plus the target; drop identifiers and the
# raw columns they were derived from.  (Original list literal was missing
# commas — SyntaxError; the redundant axis=1 alongside columns= is removed.)
df = data.drop(columns=['id', 'Timestamp', 'Daily Time Spent on Site', 'Age',
                        'Area Income', 'Daily Internet Usage', 'Ad Topic Line',
                        'Country', 'no_click', 'date'])
df.head()
df.shape
(1000, 8)
# Confirm the remaining modelling columns.
df.columns
Index(['gender', 'Clicked', 'day', 'site_time', 'age_groups', 'income_level',
'total_time', 'ad_effectiveness'],
dtype='object')

Create response variable

# Response/target variable: 1 = clicked, 0 = not clicked.
df_y=df['Clicked']
df_y.shape
(1000,)

Create features


# One-hot encode the engineered categorical features.
# (Original list literal was missing commas between items — SyntaxError.)
feature = ['day', 'site_time', 'age_groups', 'income_level', 'total_time',
           'ad_effectiveness']
df_X = pd.get_dummies(df[feature])
df_X.shape
(1000, 23)
# Append gender as-is (presumably already numeric/binary — TODO confirm;
# otherwise it should go through get_dummies like the other categoricals).
df_X['gender']=df['gender']
df_X.shape
(1000, 24)

Set up Random Forest Classifier to make predictions

from sklearn.model_selection import train_test_split

# 80/20 train/test split; fixed random_state for reproducibility.
# (Original call was missing a comma: test_size=0.20random_state=7.)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.20,
                                                    random_state=7)
print(X_train.shape)
print(y_train.shape)
(800, 24)
(800,)

from sklearn.ensemble import RandomForestClassifier

# (Original constructor was missing commas between keyword args — SyntaxError.)
rfc = RandomForestClassifier(n_estimators=75, max_depth=5, random_state=7)
rfc.fit(X_train, y_train)
# Accuracy on the held-out 20%.  NOTE(review): the original also bound an
# unused `prediction = rfc.predict(X_train)` (train-set predictions); dropped.
rfc.score(X_test, y_test)
0.945

We can predict clicks based on data with 94.5% accuracy

Layer 1 Layer 1 Created using FigmaCreated using Figma ic-unifiedSplitView-2 Created using Figma Group ic-key Created using Figma Layer 1